{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6a63bd39-9b87-4316-8680-a9cf0672158a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dotenv import load_dotenv\n",
    "import os\n",
    "\n",
    "# Load variables from a .env file, if one is found, into the process environment\n",
    "load_dotenv()\n",
    "\n",
    "# Read OPENAI_API_KEY from the environment; fall back to the placeholder when unset\n",
    "openai_api_key = os.environ.get('OPENAI_API_KEY', 'YourAPIKey')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dd94cd99-82bc-4cad-81c2-6cac78c18e70",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import DirectoryLoader\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.document_loaders import WebBaseLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2bc6d009-fd25-41cb-b4bf-2d2dfb27751d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Your 1 documents have been split into 28 chunks\n"
     ]
    }
   ],
   "source": [
    "# Fetch a single web page to use as the document source\n",
    "loader = WebBaseLoader(\n",
    "    \"http://www.paulgraham.com/wealth.html\"\n",
    ")\n",
    "docs = loader.load()\n",
    "\n",
    "# Cut the page into big chunks (chunk_size=2000) with no overlap between them\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=2000,\n",
    "    chunk_overlap=0,\n",
    ")\n",
    "chunks = text_splitter.split_documents(docs)\n",
    "\n",
    "# Report how many chunks the loaded documents produced\n",
    "print(f\"Your {len(docs)} documents have been split into {len(chunks)} chunks\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0af17f88-e32c-455f-93b6-e747e0b15bdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass the key loaded in the first cell explicitly, so a key supplied there\n",
    "# (rather than through the environment) is actually used for the embedding calls\n",
    "embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)\n",
    "\n",
    "# Embed every chunk and index the resulting vectors in a Chroma vector store\n",
    "vectordb = Chroma.from_documents(documents=chunks, embedding=embedding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7ebada96-e58c-4941-8b5d-8e95bc459683",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plain similarity search, asking for 8 chunks per query\n",
    "retriever_vanilla = vectordb.as_retriever(\n",
    "    search_type='similarity',\n",
    "    search_kwargs={'k': 8},\n",
    ")\n",
    "\n",
    "# Same store and same k, but using the 'mmr' search type for comparison\n",
    "retriever_mmr = vectordb.as_retriever(\n",
    "    search_type='mmr',\n",
    "    search_kwargs={'k': 8},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "181d06cb-ac3a-45c3-bb4b-162e0925d45b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the question through the similarity retriever\n",
    "vanilla_relevant_docs = retriever_vanilla.get_relevant_documents(\n",
    "    'What is the best way to make and keep wealth?'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "268d4aa1-9df5-47c2-af44-5460a0cd9343",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the same question through the MMR retriever\n",
    "mmr_relevant_docs = retriever_mmr.get_relevant_documents(\n",
    "    'What is the best way to make and keep wealth?'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b149fe0e-60aa-49c6-8721-d4a5ce838ef1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def analyze_list_overlap(list1, list2, content_attr='page_content'):\n",
    "    \"\"\"\n",
    "    Summarize how much two lists of objects overlap, comparing by one attribute.\n",
    "\n",
    "    Objects are compared by the value of `content_attr`, so that value must be\n",
    "    hashable. Overlap and uniqueness are counted over distinct attribute values,\n",
    "    while the totals count every item in each list, duplicates included.\n",
    "\n",
    "    Parameters:\n",
    "    list1 (list): The first list of objects to compare.\n",
    "    list2 (list): The second list of objects to compare.\n",
    "    content_attr (str): The attribute name of the content to use for comparison.\n",
    "\n",
    "    Returns:\n",
    "    dict: Counts under the keys 'total_list1', 'total_list2', 'overlap_count',\n",
    "          'unique_to_list1_count' and 'unique_to_list2_count'.\n",
    "\n",
    "    Raises:\n",
    "    AttributeError: If an object in either list lacks `content_attr`.\n",
    "    \"\"\"\n",
    "    # Distinct attribute values seen in each list\n",
    "    set1_contents = {getattr(doc, content_attr) for doc in list1}\n",
    "    set2_contents = {getattr(doc, content_attr) for doc in list2}\n",
    "\n",
    "    # Only counts are reported, so the matching objects themselves are not collected\n",
    "    return {\n",
    "        'total_list1': len(list1),\n",
    "        'total_list2': len(list2),\n",
    "        'overlap_count': len(set1_contents & set2_contents),\n",
    "        'unique_to_list1_count': len(set1_contents - set2_contents),\n",
    "        'unique_to_list2_count': len(set2_contents - set1_contents),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c546588b-3428-427e-b264-56c25222a84f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'total_list1': 8,\n",
       " 'total_list2': 8,\n",
       " 'overlap_count': 6,\n",
       " 'unique_to_list1_count': 2,\n",
       " 'unique_to_list2_count': 2}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compare what the two retrievers returned for the same question\n",
    "analyze_list_overlap(\n",
    "    list1=vanilla_relevant_docs,\n",
    "    list2=mmr_relevant_docs,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc7d4e89-964c-4f37-bfcc-69d846c6070f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
